In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
import optuna
import pickle
In [2]:
file_path_train = "./Bangalore_house_prices/train.csv"
file_path_test = "./Bangalore_house_prices/test.csv"
df_train = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)

df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4137 entries, 0 to 4136
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    4137 non-null   object 
 1   total_sqft  4137 non-null   float64
 2   bath        4137 non-null   int64  
 3   price       4137 non-null   float64
 4   bhk         4137 non-null   int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 161.7+ KB
In [3]:
# Getting target value from df_train and df_test
X_train = df_train.copy()
y_train = X_train.pop("price")

X_test = df_test.copy()
y_test = X_test.pop("price")
In [4]:
X_train.head()
Out[4]:
location total_sqft bath bhk
0 1ST PHASE JP NAGAR 1875.0 3 3
1 1ST PHASE JP NAGAR 1590.0 3 3
2 1ST PHASE JP NAGAR 1566.0 2 2
3 1ST PHASE JP NAGAR 2065.0 4 3
4 1ST PHASE JP NAGAR 1394.0 2 2
In [5]:
y_train.head()
Out[5]:
0    167.0
1    131.0
2    180.0
3    210.0
4     85.0
Name: price, dtype: float64

Creating Pipeline¶

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Getting numerical and categorical columns
numerical_cols = X_train.select_dtypes(exclude=["object"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])
In [7]:
# For eval test in XGBoost
bundle = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

bundle.fit(X_train)
eval_test = bundle.transform(X_test)

Model Building¶


In [49]:
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

Linear Regression¶

In [8]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()

# Bundle preprocessing and modeling code in a pipeline
lr_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lr_model)
])

lr_pipe.fit(X_train, y_train)
Out[8]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model', LinearRegression())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model', LinearRegression())])
ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 Index(['location'], dtype='object'))])
Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
LinearRegression()
In [9]:
pd.Series({
    "Train Score": lr_pipe.score(X_train, y_train),
    "Test Score": lr_pipe.score(X_test, y_test)
})
Out[9]:
Train Score    0.864645
Test Score     0.790460
dtype: float64
In [48]:
lr_cv_score = cross_val_score(lr_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", lr_cv_score, "\n",
    "Cross Validation Mean Score \t", lr_cv_score.mean(), sep=""
)
Cross Validation Score 		[0.83154931 0.88158449 0.86728667 0.78621933 0.86571685]
Cross Validation Mean Score 	0.8464713287043215

Decision Tree¶

In [11]:
from sklearn.tree import DecisionTreeRegressor

def objective(trial):
    dt_params = dict(
        criterion=trial.suggest_categorical('criterion', ["squared_error", "friedman_mse"]),
        splitter=trial.suggest_categorical('splitter', ["best", "random"]),
        max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 100, 3000)
    )
    
    dt = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', DecisionTreeRegressor(**dt_params))
    ])
    
    return cross_val_score(dt, X_train, y_train, cv=cv).mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
dt_params = study.best_params
[I 2022-06-11 21:57:34,951] A new study created in memory with name: no-name-d7614cfa-81bb-4fff-95e9-79c699f12eac
[I 2022-06-11 21:57:36,417] Trial 0 finished with value: 0.7968530196240977 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 2075}. Best is trial 0 with value: 0.7968530196240977.
[I 2022-06-11 21:57:38,080] Trial 1 finished with value: 0.7678717514895723 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 2912}. Best is trial 0 with value: 0.7968530196240977.
[I 2022-06-11 21:57:39,767] Trial 2 finished with value: 0.7926360459711335 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 2834}. Best is trial 0 with value: 0.7968530196240977.
[I 2022-06-11 21:57:40,630] Trial 3 finished with value: 0.7818653309434953 and parameters: {'criterion': 'friedman_mse', 'splitter': 'random', 'max_leaf_nodes': 434}. Best is trial 0 with value: 0.7968530196240977.
[I 2022-06-11 21:57:42,053] Trial 4 finished with value: 0.8026220034138282 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 1177}. Best is trial 4 with value: 0.8026220034138282.
[I 2022-06-11 21:57:43,311] Trial 5 finished with value: 0.7879497128602452 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 2740}. Best is trial 4 with value: 0.8026220034138282.
[I 2022-06-11 21:57:44,588] Trial 6 finished with value: 0.7887850124445721 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 2765}. Best is trial 4 with value: 0.8026220034138282.
[I 2022-06-11 21:57:45,762] Trial 7 finished with value: 0.8045119069173114 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 663}. Best is trial 7 with value: 0.8045119069173114.
[I 2022-06-11 21:57:47,138] Trial 8 finished with value: 0.790289145586975 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 1061}. Best is trial 7 with value: 0.8045119069173114.
[I 2022-06-11 21:57:48,111] Trial 9 finished with value: 0.7909316891334115 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best', 'max_leaf_nodes': 392}. Best is trial 7 with value: 0.8045119069173114.
[I 2022-06-11 21:57:49,737] Trial 10 finished with value: 0.8036756154291794 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 1867}. Best is trial 7 with value: 0.8045119069173114.
[I 2022-06-11 21:57:51,292] Trial 11 finished with value: 0.8040953689108153 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 1871}. Best is trial 7 with value: 0.8045119069173114.
[I 2022-06-11 21:57:52,744] Trial 12 finished with value: 0.7988572065627937 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 1355}. Best is trial 7 with value: 0.8045119069173114.
[I 2022-06-11 21:57:54,081] Trial 13 finished with value: 0.8050947771752798 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 733}. Best is trial 13 with value: 0.8050947771752798.
[I 2022-06-11 21:57:55,339] Trial 14 finished with value: 0.798618611006402 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 757}. Best is trial 13 with value: 0.8050947771752798.
[I 2022-06-11 21:57:56,001] Trial 15 finished with value: 0.8023047624334361 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 152}. Best is trial 13 with value: 0.8050947771752798.
[I 2022-06-11 21:57:57,248] Trial 16 finished with value: 0.80498076851405 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 766}. Best is trial 13 with value: 0.8050947771752798.
[I 2022-06-11 21:57:58,580] Trial 17 finished with value: 0.8041300480080753 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 933}. Best is trial 13 with value: 0.8050947771752798.
[I 2022-06-11 21:57:59,799] Trial 18 finished with value: 0.7757464455049321 and parameters: {'criterion': 'squared_error', 'splitter': 'random', 'max_leaf_nodes': 1491}. Best is trial 13 with value: 0.8050947771752798.
[I 2022-06-11 21:58:00,448] Trial 19 finished with value: 0.8008839608226905 and parameters: {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 135}. Best is trial 13 with value: 0.8050947771752798.
In [12]:
optuna.visualization.plot_param_importances(study)
In [13]:
optuna.visualization.plot_optimization_history(study)
In [14]:
dt_params
Out[14]:
{'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 733}
In [ ]:
dt_params = {'criterion': 'squared_error', 'splitter': 'best', 'max_leaf_nodes': 733}
In [15]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(**dt_params)

# Bundle preprocessing and modeling code in a pipeline
dt_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dt_model)
])

dt_pipe.fit(X_train, y_train)
Out[15]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model', DecisionTreeRegressor(max_leaf_nodes=733))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model', DecisionTreeRegressor(max_leaf_nodes=733))])
ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 Index(['location'], dtype='object'))])
Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
DecisionTreeRegressor(max_leaf_nodes=733)
In [16]:
pd.Series({
    "Train Score": dt_pipe.score(X_train, y_train),
    "Test Score": dt_pipe.score(X_test, y_test)
})
Out[16]:
Train Score    0.988917
Test Score     0.899828
dtype: float64
In [17]:
dt_cv_score = cross_val_score(dt_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", dt_cv_score, "\n",
    "Cross Validation Mean Score \t", dt_cv_score.mean(), sep=""
)
Cross Validation Score 		[0.80737992 0.85251181 0.82975466 0.60047708 0.86708361]
Cross Validation Mean Score 	0.7914414157443508

Random Forest¶

In [ ]:
"""We are not hypertuning RandomForestRegressor because it's going to take a long time
from sklearn.ensemble import RandomForestRegressor

def objective(trial):
    rf_params = dict(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        criterion=trial.suggest_categorical('criterion', ["squared_error", "absolute_error"]),
        max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 100, 3000)
    )
    
    rf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(**rf_params))
    ])
    
    return cross_val_score(rf, X_train, y_train, cv=cv).mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
rf_params = study.best_params
"""
In [18]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()

# Bundle preprocessing and modeling code in a pipeline
rf_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

rf_pipe.fit(X_train, y_train)
Out[18]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model', RandomForestRegressor())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model', RandomForestRegressor())])
ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 Index(['location'], dtype='object'))])
Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestRegressor()
In [19]:
pd.Series({
    "Train Score": rf_pipe.score(X_train, y_train),
    "Test Score": rf_pipe.score(X_test, y_test)
})
Out[19]:
Train Score    0.962790
Test Score     0.888108
dtype: float64
In [20]:
rf_cv_score = cross_val_score(rf_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", rf_cv_score, "\n",
    "Cross Validation Mean Score \t", rf_cv_score.mean(), sep=""
)
Cross Validation Score 		[0.77141246 0.84309946 0.85190028 0.61993243 0.89916466]
Cross Validation Mean Score 	0.7971018606826477

XGBoost¶

In [21]:
from xgboost import XGBRegressor

def objective(trial):
    xgb_params = dict(
        learning_rate=trial.suggest_float("learning_rate", 1e-2, 1e-1, log=True),
        n_estimators=trial.suggest_int("n_estimators", 100, 500)
    )
    
    xgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(**xgb_params))
    ])
    
    xgb.fit(
        X_train, y_train,
        model__early_stopping_rounds=100,
        model__eval_set=[(eval_test, y_test)],
        model__verbose=False
    )
    return cross_val_score(xgb, X_train, y_train, cv=cv).mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
xgb_params = study.best_params
C:\Users\User\anaconda3\envs\data-science\lib\site-packages\xgboost\compat.py:36: FutureWarning:

pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.

[I 2022-06-11 22:03:37,233] A new study created in memory with name: no-name-ccccb1da-441e-4708-8135-7051f5fb2226
[I 2022-06-11 22:03:53,622] Trial 0 finished with value: 0.7843913570955278 and parameters: {'learning_rate': 0.01826119870880335, 'n_estimators': 260}. Best is trial 0 with value: 0.7843913570955278.
[I 2022-06-11 22:04:17,672] Trial 1 finished with value: 0.8089597798789938 and parameters: {'learning_rate': 0.022755442929083564, 'n_estimators': 387}. Best is trial 1 with value: 0.8089597798789938.
[I 2022-06-11 22:04:43,016] Trial 2 finished with value: 0.7939356339224122 and parameters: {'learning_rate': 0.01507936473215109, 'n_estimators': 386}. Best is trial 1 with value: 0.8089597798789938.
[I 2022-06-11 22:05:06,307] Trial 3 finished with value: 0.8000568025622494 and parameters: {'learning_rate': 0.01897567434963167, 'n_estimators': 358}. Best is trial 1 with value: 0.8089597798789938.
[I 2022-06-11 22:05:24,091] Trial 4 finished with value: 0.8199828063157588 and parameters: {'learning_rate': 0.04828024435485572, 'n_estimators': 274}. Best is trial 4 with value: 0.8199828063157588.
[I 2022-06-11 22:05:32,821] Trial 5 finished with value: 0.7678444922582894 and parameters: {'learning_rate': 0.02819118930965911, 'n_estimators': 126}. Best is trial 4 with value: 0.8199828063157588.
[I 2022-06-11 22:05:40,562] Trial 6 finished with value: 0.7101888081996007 and parameters: {'learning_rate': 0.01904233173440493, 'n_estimators': 109}. Best is trial 4 with value: 0.8199828063157588.
[I 2022-06-11 22:05:57,002] Trial 7 finished with value: 0.8371117497873091 and parameters: {'learning_rate': 0.09710329326224534, 'n_estimators': 253}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:06:24,256] Trial 8 finished with value: 0.8228104378608971 and parameters: {'learning_rate': 0.035197614252757095, 'n_estimators': 428}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:06:55,735] Trial 9 finished with value: 0.8059293336732521 and parameters: {'learning_rate': 0.016226617718595095, 'n_estimators': 489}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:07:08,898] Trial 10 finished with value: 0.8330067893954605 and parameters: {'learning_rate': 0.09691256210385771, 'n_estimators': 204}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:07:21,512] Trial 11 finished with value: 0.8268931534729113 and parameters: {'learning_rate': 0.09823212106053601, 'n_estimators': 191}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:07:34,172] Trial 12 finished with value: 0.8283738175908549 and parameters: {'learning_rate': 0.09947709602579106, 'n_estimators': 200}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:07:48,509] Trial 13 finished with value: 0.8212699276036867 and parameters: {'learning_rate': 0.06312460235460259, 'n_estimators': 217}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:08:08,475] Trial 14 finished with value: 0.8302348469581364 and parameters: {'learning_rate': 0.06772987781045565, 'n_estimators': 306}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:08:19,100] Trial 15 finished with value: 0.6546668604234721 and parameters: {'learning_rate': 0.01093480355775935, 'n_estimators': 152}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:08:39,295] Trial 16 finished with value: 0.8324266153833081 and parameters: {'learning_rate': 0.07314122632985683, 'n_estimators': 314}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:08:55,020] Trial 17 finished with value: 0.817071851028552 and parameters: {'learning_rate': 0.04847666310928432, 'n_estimators': 241}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:09:06,755] Trial 18 finished with value: 0.8062879333357265 and parameters: {'learning_rate': 0.04834862855631613, 'n_estimators': 168}. Best is trial 7 with value: 0.8371117497873091.
[I 2022-06-11 22:09:27,891] Trial 19 finished with value: 0.832840296714345 and parameters: {'learning_rate': 0.07806085341112905, 'n_estimators': 325}. Best is trial 7 with value: 0.8371117497873091.
In [46]:
optuna.visualization.plot_param_importances(study)
In [47]:
optuna.visualization.plot_optimization_history(study)
In [28]:
xgb_params
Out[28]:
{'learning_rate': 0.09666399416904463, 'n_estimators': 364}
In [25]:
xgb_params = {'learning_rate': 0.09666399416904463, 'n_estimators': 364}
In [29]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor(**xgb_params)

# Bundle preprocessing and modeling code in a pipeline
xgb_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb_model)
])

xgb_pipe.fit(
    X_train, y_train,
    model__early_stopping_rounds=100,
    model__eval_set=[(eval_test, y_test)],
    model__verbose=False
)
Out[29]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model',
                 XGBRe...
                              gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='',
                              learning_rate=0.09666399416904463,
                              max_delta_step=0, max_depth=6, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=364, n_jobs=8, num_parallel_tree=1,
                              predictor='auto', random_state=0, reg_alpha=0,
                              reg_lambda=1, scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='constant'),
                                                  Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['location'], dtype='object'))])),
                ('model',
                 XGBRe...
                              gamma=0, gpu_id=-1, importance_type=None,
                              interaction_constraints='',
                              learning_rate=0.09666399416904463,
                              max_delta_step=0, max_depth=6, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=364, n_jobs=8, num_parallel_tree=1,
                              predictor='auto', random_state=0, reg_alpha=0,
                              reg_lambda=1, scale_pos_weight=1, subsample=1,
                              tree_method='exact', validate_parameters=1,
                              verbosity=None))])
ColumnTransformer(transformers=[('num', SimpleImputer(strategy='constant'),
                                 Index(['total_sqft', 'bath', 'bhk'], dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 Index(['location'], dtype='object'))])
Index(['total_sqft', 'bath', 'bhk'], dtype='object')
SimpleImputer(strategy='constant')
Index(['location'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore', sparse=False)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.09666399416904463,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=364, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

Train Test Score¶

In [31]:
pd.Series({
    "Train Score": xgb_pipe.score(X_train, y_train),
    "Test Score": xgb_pipe.score(X_test, y_test)
})
Out[31]:
Train Score    0.974757
Test Score     0.934992
dtype: float64

Cross Validation Score¶

In [32]:
xgb_cv_score = cross_val_score(xgb_pipe, X_train, y_train, cv=cv)

print(
    "Cross Validation Score \t\t", xgb_cv_score, "\n",
    "Cross Validation Mean Score \t", xgb_cv_score.mean(), sep=""
)
Cross Validation Score 		[0.88023795 0.88931915 0.89865116 0.62436034 0.91485851]
Cross Validation Mean Score 	0.8414854226237235

Comparing Models on Test data¶

In [51]:
scores = []
models = [lr_pipe, dt_pipe, rf_pipe, xgb_pipe]
for model in models:
    scores.append({
        'Model': str(model.named_steps.model).split('(')[0],
        'r2 Score': r2_score(y_test, model.predict(X_test)),
        'MAE Score': mean_absolute_error(y_test, model.predict(X_test)),
        'MSE Score': mean_squared_error(y_test, model.predict(X_test))
    })

pd.DataFrame(scores).set_index('Model')
Out[51]:
r2 Score MAE Score MSE Score
Model
LinearRegression 0.790460 16.634335 1979.243270
DecisionTreeRegressor 0.899828 16.213864 946.188374
RandomForestRegressor 0.888108 15.006293 1056.890782
XGBRegressor 0.934992 13.591667 614.045627

Exporting Model Pipeline as Pickle¶

In [52]:
with open("model.pkl", "wb") as f:
    pickle.dump(xgb_pipe, f)